package zxk.spider.webmagic.processor;

import org.jsoup.Jsoup;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.List;

/**
 * Page processor that crawls pages of the Patent Examination Cooperation Center website.
 *
 * @author 舟小亢
 * @date 2022-11-5 11:55:40
 */
@Component
public class PatentPageProcessor implements PageProcessor {

    /**
     * Crawl configuration for the target site: retry count of 3 and a sleep time of 1000
     * (presumably milliseconds, per WebMagic's {@code Site} API).
     */
    private Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(1000);

    /**
     * Extracts the recruitment announcements from the fetched page.
     *
     * <p>Every {@code li} matched under {@code div#job} / {@code div.content ul} is reduced to
     * its plain text, and the resulting list is stored on the page under the {@code "job"} field.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        // Narrow down to the job section first, then pick the individual list items.
        Selectable jobSection = page.getHtml().css("div#job");
        List<Selectable> items = jobSection.css("div.content ul > li").nodes();

        List<String> announcements = new ArrayList<>();
        for (int i = 0; i < items.size(); i++) {
            announcements.add(plainText(items.get(i)));
        }

        page.putField("job", announcements);
    }

    /**
     * Returns the site configuration used by this processor.
     *
     * @return the {@link Site} settings held by this instance
     */
    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Converts a selected node's HTML into its text content by parsing it with jsoup.
     */
    private static String plainText(Selectable node) {
        String markup = node.toString();
        return Jsoup.parse(markup).text();
    }

    /**
     * Runs the crawler standalone: starts at "http://www.cnipa-sc.com/" with 5 threads.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        PatentPageProcessor processor = new PatentPageProcessor();

        Spider crawler = Spider.create(processor)
                // Seed URL where the crawl begins.
                .addUrl("http://www.cnipa-sc.com/")
                // Crawl with 5 threads.
                .thread(5);

        // Start the spider.
        crawler.run();
    }
}